# Title: Measuring Interethnic Marriage in Africa
# Author: Branden Bohrnsen, bohrnsen@umich.edu
# Date: 1/30/2026
# Description: Clean and prepare IPUMS census data for intermarriage analysis.

library(pacman)
p_load(tidyverse, reshape2, ipumsr, ggpubr, readxl, here, patchwork)
options(scipen = 999)
set.seed(07151129)
here()

ddi <- read_ipums_ddi(here("data", "ipums", "ipumsi_00003.xml"))
data <- read_ipums_micro(ddi)
blocks <- read_excel(here("data", "original", "ethnic group blocks.xlsx")) %>%
  select(EthnicGroup = `Census category`, EthnicBlock = `Ethnic block`) %>% na.omit()

census <- data %>% filter(YEAR %in% c(2000, 2010))

for (var in names(census)[!endsWith(names(census), "LOC")]) {
  tryCatch({
    list <- list()
    list[["names"]] <- names(attr(census[[var]], "labels"))
    list[["keys"]] <- as.numeric(attr(census[[var]], "labels"))


    census[[var]] <- ifelse(
      census[[var]] %in% list[["keys"]],
      list[["names"]][
        match(census[[var]], list[["keys"]])
      ],
      census[[var]]
    )
    },  error = function(e) {
    cat("ERROR in variable", var, ": ", e$message, "\n")
  })
}

census <- census %>% mutate(
  GEOLEV1 = case_when(
    GEOLEV1 == 894001 ~ "Central",
    GEOLEV1 == 894002 ~ "Copperbelt",
    GEOLEV1 == 894003 ~ "Eastern",
    GEOLEV1 == 894004 ~ "Luapula",
    GEOLEV1 == 894005 ~ "Lusaka",
    GEOLEV1 == 894008 ~ "North Western",
    GEOLEV1 == 894009 ~ "Southern",
    GEOLEV1 == 894010 ~ "Western",
    .default = NA
  )) %>% mutate(GEOLEV2 = case_when(
    GEOLEV2 == 894001001 ~ "Chibombo, Kabwe, Kapiri Mposhi, Mkushi",
    GEOLEV2 == 894001002 ~ "Mumbwa",
    GEOLEV2 == 894001003 ~ "Serenje",
    GEOLEV2 == 894002001 ~ "Chililabombwe",
    GEOLEV2 == 894002002 ~ "Chingola",
    GEOLEV2 == 894002003 ~ "Kalulushi",
    GEOLEV2 == 894002004 ~ "Kitwe",
    GEOLEV2 == 894002005 ~ "Luanshya",
    GEOLEV2 == 894002006 ~ "Lufwanyama",
    GEOLEV2 == 894002007 ~ "Mufulira",
    GEOLEV2 == 894002008 ~ "Ndola",
    GEOLEV2 == 894003001 ~ "Chadiza",
    GEOLEV2 == 894003002 ~ "Chipata, Mambwe",
    GEOLEV2 == 894003003 ~ "Katete",
    GEOLEV2 == 894003004 ~ "Lundazi",
    GEOLEV2 == 894003005 ~ "Nyimba, Petauke",
    GEOLEV2 == 894003006 ~ "Chama",
    GEOLEV2 == 894003007 ~ "Chinsali",
    GEOLEV2 == 894003008 ~ "Isoka, Mafinga, Nakonde",
    GEOLEV2 == 894003009 ~ "Mpika",
    GEOLEV2 == 894003010 ~ "Chilubi",
    GEOLEV2 == 894003011 ~ "Kaputa",
    GEOLEV2 == 894003012 ~ "Kasama, Mungwi",
    GEOLEV2 == 894003013 ~ "Luwingu",
    GEOLEV2 == 894003014 ~ "Mbala, Mpulungu",
    GEOLEV2 == 894003015 ~ "Mporokoso",
    GEOLEV2 == 894004001 ~ "Chienge, Nchelenge",
    GEOLEV2 == 894004002 ~ "Kawambwa",
    GEOLEV2 == 894004003 ~ "Mansa, Milenge",
    GEOLEV2 == 894004004 ~ "Mwense",
    GEOLEV2 == 894004005 ~ "Samfya",
    GEOLEV2 == 894005001 ~ "Chongwe, Kafue",
    GEOLEV2 == 894005002 ~ "Luangwa",
    GEOLEV2 == 894005003 ~ "Lusaka",
    GEOLEV2 == 894008001 ~ "Chavuma, Zambezi",
    GEOLEV2 == 894008002 ~ "Ikelenge, Mwinilunga",
    GEOLEV2 == 894008003 ~ "Kabompo",
    GEOLEV2 == 894008004 ~ "Kasempa",
    GEOLEV2 == 894008005 ~ "Mufumbwe",
    GEOLEV2 == 894008006 ~ "Solwezi",
    GEOLEV2 == 894009001 ~ "Choma",
    GEOLEV2 == 894009002 ~ "Gwembe",
    GEOLEV2 == 894009003 ~ "Itezhi Tezhi, Namwala",
    GEOLEV2 == 894009004 ~ "Kalomo, Kazungula",
    GEOLEV2 == 894009005 ~ "Livingstone",
    GEOLEV2 == 894009006 ~ "Mazabuka",
    GEOLEV2 == 894009007 ~ "Monze",
    GEOLEV2 == 894009008 ~ "Siavonga",
    GEOLEV2 == 894009009 ~ "Sinazongwe",
    GEOLEV2 == 894010001 ~ "Kalabo",
    GEOLEV2 == 894010002 ~ "Kaoma",
    GEOLEV2 == 894010003 ~ "Lukulu",
    GEOLEV2 == 894010004 ~ "Mongu",
    GEOLEV2 == 894010005 ~ "Senanga, Shang'ombo",
    GEOLEV2 == 894010006 ~ "Sesheke",
    .default = NA
  )) %>% mutate(
    YEAR_HOUSEHOLD = paste0(YEAR, "_", SERIAL)
  )

census <- census %>% left_join(y = blocks, by = join_by(ETHNICZM == EthnicGroup))

census <- census %>%
  select(
    CensusYear = YEAR,
    YearHousehold = YEAR_HOUSEHOLD,
    PersonID = PERNUM,
    Urban = URBAN,
    Sex = SEX,
    District = GEOLEV2,
    Const = CONSTZM,
    MigYrs = MIGYRS1,
    Age = AGE,
    AgeMarried = AGEMARR,
    Ethnicity = ETHNICZM,
    EthnicBlock,
    SpouseID = SPLOC,
    Polygamous = POLYMAL,
    MarriageStatus = MARST
)

correct_district_mappings <-
  table(census$Const, census$District) %>%
  as.data.frame %>% filter(Freq >= 1000) %>%
  select(Const = Var1, District = Var2)

census <- census %>%
  select(-District) %>%
  left_join(y = correct_district_mappings, join_by(Const))

census <- census %>%
  mutate(
    Province = case_when(
        grepl("Chibombo|Kabwe|Kapiri Mposhi|Mkushi|Mumbwa|Serenje", District) ~ "Central",
        grepl("Chililabombwe|Chingola|Kalulushi|Kitwe|Luanshya|Lufwanyama|Mufulira|Ndola", District) ~ "Copperbelt",
        grepl("Chadiza|Chipata|Katete|Lundazi|Mambwe|Nyimba", District) ~ "Eastern",
        grepl("Chienge|Kawambwa|Mansa|Milenge|Mwense|Nchelenge|Samfya", District) ~ "Luapula",
        grepl("Chongwe|Kafue|Luangwa|Lusaka", District) ~ "Lusaka",
        grepl("Chilubi|Kaputa|Kasama|Luwingu|Mbala|Mporokoso|Mpulungu|Mungwi", District) ~ "Northern",
        grepl("Chavuma|Ikelenge|Kabompo|Kasempa|Mufumbwe|Mwinilunga|Solwezi|Zambezi", District) ~ "North-Western",
        grepl("Choma|Gwembe|Itezhi Tezhi|Kalomo|Kazungula|Livingstone|Mazabuka|Monze|Namwala|Siavonga|Sinazongwe", District) ~ "Southern",
        grepl("Kalabo|Kaoma|Lukulu|Mongu|Senanga|Sesheke|Shang'ombo", District) ~ "Western",
        grepl("Chama|Chinsali|Isoka|Mafinga|Mpika|Nakonde", District) ~ "Muchinga",
        TRUE ~ NA_character_
    )
  )

census <- census %>%
  mutate(Married = as.integer(MarriageStatus == "Married/in union")) %>%
  filter(EthnicBlock != "Drop")

table(census$Married)

census <- census %>%
  mutate(
    Age = case_when(
      Age == "1 year" ~ "1",
      Age == "2 years" ~ "2",
      Age == "Less than 1 year" ~ "0",
      TRUE ~ Age
    )
  ) %>%
  mutate(Age = as.integer(Age))

census <- census %>%
  mutate(
    MigYrs = case_when(
      MigYrs == "1 year (or 1 year or less)" ~ "1",
      MigYrs == "Less than 1 year" ~ "1",
      MigYrs == "2 years" ~ "2",
      MigYrs == "NIU (not in universe)" ~ NA,
      TRUE ~ MigYrs
    )
  ) %>%
  mutate(MigYrs = as.integer(MigYrs))

census <- census %>%
  mutate(AgeMarried = case_when(
    AgeMarried == "NIU (not in universe)" ~ NA,
    TRUE ~ AgeMarried
  ))

census <- census %>%
  mutate(
    SpouseID = as.character(SpouseID),
    PersonID = as.character(PersonID)
  )

census <- census %>% mutate(YearBorn = CensusYear - Age)

census <- census %>%
  mutate(
    InMarket_50s = as.integer(between(YearBorn, 1885, 1941)),
    InMarket_60s = as.integer(between(YearBorn, 1895, 1951)),
    InMarket_70s = as.integer(between(YearBorn, 1905, 1961)),
    InMarket_80s = as.integer(between(YearBorn, 1915, 1971)),
    InMarket_90s = as.integer(between(YearBorn, 1925, 1981)),
    InMarket_00s = as.integer(between(YearBorn, 1935, 1991))
  )

sample <- census %>%
  group_by(YearHousehold) %>%
  filter(Sex == "Male", Married == TRUE) %>%
  inner_join(census,
    by = c("YearHousehold", "SpouseID" = "PersonID"),
    suffix = c("_Head", "_Spouse")
  ) %>%
  rename(SpouseID_Head = PersonID)

all(sample$SpouseID_Head == sample$SpouseID_Spouse)
all(sample$Sex_Head != sample$Sex_Spouse)

sample <- sample %>%
  select(
    YearHousehold, SpouseID_Head, SpouseID_Spouse,
    District = District_Head, Province = Province_Head,
    CensusYear = CensusYear_Head,
    Const = Const_Head,
    MigYrs_Spouse,
    Age_Head, Age_Spouse,
    Ethnicity_Head, Ethnicity_Spouse,
    AgeMarried_Head, AgeMarried_Spouse,
    EthnicBlock_Head, EthnicBlock_Spouse,
    Polygamous_Head,
    Married = Married_Head,
    contains("InMarket")
  )

sample <- sample %>%
  mutate(
    CensusYear = as.integer(CensusYear),
    MigYrs_Spouse = as.integer(MigYrs_Spouse),
    Age_Head = as.integer(Age_Head),
    Age_Spouse = as.integer(Age_Spouse),
    AgeMarried_Head = as.integer(AgeMarried_Head),
    AgeMarried_Spouse = as.integer(AgeMarried_Spouse),
  ) %>%
  ungroup()

age_married_plot_m <- ggplot(sample %>% filter(!is.na(AgeMarried_Head))) +
  geom_density(aes(x = AgeMarried_Head), bw = 1) +
  theme_pubclean() +
  labs(title = "Distribution of Age at First Marriage (M)", x = "Age", y = "Density") +
  geom_vline(aes(xintercept = mean(AgeMarried_Head)), color = "black", linetype = "dashed") +
  annotate("rect",
    xmin = mean(sample$AgeMarried_Head, na.rm = TRUE) - sd(sample$AgeMarried_Head, na.rm = TRUE),
    xmax = mean(sample$AgeMarried_Head, na.rm = TRUE) + sd(sample$AgeMarried_Head, na.rm = TRUE),
    ymin = 0, ymax = .25, alpha = .2, fill = "orange"
  )

age_married_plot_f <- ggplot(sample %>% filter(!is.na(AgeMarried_Spouse))) +
  geom_density(aes(x = AgeMarried_Spouse), bw = 1) +
  theme_pubclean() +
  labs(title = "Distribution of Age at First Marriage (F)", x = "Age", y = "Density",
  caption = "Dashed line is mean, shaded area is +/- one standard deviation.") +
  geom_vline(aes(xintercept = mean(AgeMarried_Spouse)), color = "black", linetype = "dashed") +
  annotate("rect",
    xmin = mean(sample$AgeMarried_Spouse, na.rm = TRUE) - sd(sample$AgeMarried_Spouse, na.rm = TRUE),
    xmax = mean(sample$AgeMarried_Spouse, na.rm = TRUE) + sd(sample$AgeMarried_Spouse, na.rm = TRUE),
    ymin = 0, ymax = .25, alpha = .2, fill = "purple"
  )

age_married_combined <- age_married_plot_m | age_married_plot_f
ggsave(here("figures", "age_married_distribution.png"), age_married_combined, width = 10, height = 5, dpi = 300, create.dir = TRUE)

mean(sample$AgeMarried_Spouse, na.rm = TRUE)
sd(sample$AgeMarried_Spouse, na.rm = TRUE)

mean(sample$AgeMarried_Head, na.rm = TRUE)
sd(sample$AgeMarried_Head, na.rm = TRUE)

sample <- sample %>%
  mutate(
    AgeMarried_Spouse = case_when(
      is.na(AgeMarried_Spouse) ~ 19,
      TRUE ~ AgeMarried_Spouse
    ),
    AgeMarried_Head = case_when(
      is.na(AgeMarried_Head) ~ 25,
      TRUE ~ AgeMarried_Head
    ),
    YearMarried = CensusYear - (Age_Spouse - AgeMarried_Spouse),
    YearsMarried = Age_Spouse - AgeMarried_Spouse
  ) %>%
  mutate(
    YearMarried = case_when(
      YearMarried > CensusYear ~ CensusYear,
      TRUE ~ YearMarried
    ),
    YearsMarried = case_when(
      YearsMarried < 0 ~ 0,
      TRUE ~ YearsMarried
    )
  )

urban_rural <- census %>%
    filter(CensusYear == 2000) %>%
    select(CensusYear, Const, Urban) %>%
    group_by(Const, Urban) %>%
    mutate(Urban = case_when(Urban == "Urban" ~ 1, Urban == "Rural" ~ 0, TRUE ~ NA)) %>%
    group_by(Const) %>%
    summarise(
        UrbanCount = sum(Urban),
        Population = n()
    ) %>%
    mutate(Urbanization = UrbanCount / Population)

urban_rural <- urban_rural %>% select(Const, Urbanization)

census <- census %>% left_join(y = urban_rural, join_by(Const))
sample <- sample %>% left_join(y = urban_rural, join_by(Const))

urbanization_plot <- ggplot(census %>% filter(!is.na(Urban))) +
    geom_density(aes(x = as.numeric(Urbanization), fill = as.factor(Urban)), alpha = 0.5) +
    theme_pubclean() +
    labs(title = "", x = "Urbanization Rate", y = "Density", fill = "IPUMS Coding")
ggsave(here("figures", "urbanization_distribution.png"), urbanization_plot, width = 8, height = 5, dpi = 300, create.dir = TRUE)

census <- census %>% select(-Urban)

sample <- sample %>%
  mutate(
    EthnicOutmarriage = case_when(
      Ethnicity_Head != Ethnicity_Spouse ~ 1,
      Ethnicity_Head == Ethnicity_Spouse ~ 0,
      TRUE ~ NA),
    BlockOutmarriage = case_when(
      EthnicBlock_Head != EthnicBlock_Spouse ~ 1,
      EthnicBlock_Head == EthnicBlock_Spouse ~ 0,
      TRUE ~ NA),
    FullReside = ifelse(
      (YearsMarried < MigYrs_Spouse), 1, 0),
    DecadeMarried = cut(YearMarried, breaks = seq(1921, 2011, by = 10),
      right = FALSE,
      labels = paste(seq(1921, 2001, by = 10),
        seq(1930, 2010, by = 10), sep = "-"))
    )

sample_m <- sample %>%
    select(CensusYear,
        District,
        Province,
        Const,
        Urbanization,
        FullReside,
        Ethnicity = Ethnicity_Head,
        EthnicBlock = EthnicBlock_Head,
        EthnicOutmarriage,
        BlockOutmarriage,
        DecadeMarried,
        YearMarried,
        InMarket_50s = InMarket_50s_Head,
        InMarket_60s = InMarket_60s_Head,
        InMarket_70s = InMarket_70s_Head,
        InMarket_80s = InMarket_80s_Head,
        InMarket_90s = InMarket_90s_Head,
        InMarket_00s = InMarket_00s_Head,
        Poly = Polygamous_Head) %>% mutate(Sex = "M")

sample_f <- sample %>%
    select(CensusYear,
        District,
        Province,
        Const,
        Urbanization,
        Ethnicity = Ethnicity_Spouse,
        EthnicBlock = EthnicBlock_Spouse,
        EthnicOutmarriage,
        BlockOutmarriage,
        FullReside,
        DecadeMarried,
        YearMarried,
        InMarket_50s = InMarket_50s_Spouse,
        InMarket_60s = InMarket_60s_Spouse,
        InMarket_70s = InMarket_70s_Spouse,
        InMarket_80s = InMarket_80s_Spouse,
        InMarket_90s = InMarket_90s_Spouse,
        InMarket_00s = InMarket_00s_Spouse,
        Poly = Polygamous_Head) %>% mutate(Sex = "F")

sample_individuals <- bind_rows(sample_m, sample_f)

na_distribution_plot <- function(df) {
    na_counts <- colSums(is.na(df))
    na_df <- data.frame(variable = names(na_counts), na_count = na_counts) %>%
        filter(na_count > 0)

    ggplot(na_df, aes(x = reorder(variable, -na_count), y = na_count)) +
        geom_bar(stat = "identity", fill = "steelblue") +
        coord_flip() +
        labs(x = "Variable", y = "Count of NA values", title = "NA Value Distribution") +
        theme_minimal()
}

na_plot_sample <- na_distribution_plot(sample)
na_plot_census <- na_distribution_plot(census)
ggsave(here("figures", "na_distribution_sample.png"), na_plot_sample, width = 8, height = 6, dpi = 300, create.dir = TRUE)
ggsave(here("figures", "na_distribution_census.png"), na_plot_census, width = 8, height = 6, dpi = 300, create.dir = TRUE)

sample <- sample %>%
    filter(!DecadeMarried %in% c("1921-1930", "1931-1940", "1941-1950"))
sample_individuals <- sample_individuals %>%
    filter(!DecadeMarried %in% c("1921-1930", "1931-1940", "1941-1950"))

if (!dir.exists(here("data", "clean"))) {
  dir.create(here("data", "clean"), recursive = TRUE)
}

write_csv(sample, here("data", "clean", "sample.csv"))
write_csv(census, here("data", "clean", "census.csv"))
write_csv(sample_individuals, here("data", "clean", "sample_individuals.csv"))

sex_dist_census <- ggplot(census) + geom_bar(aes(x = Sex), fill = "pink") + theme_pubclean() + labs(title = "Census")
sex_dist_sample <- ggplot(sample_individuals) + geom_bar(aes(x = Sex), fill = "pink") + theme_pubclean() + labs(title = "Sample")
sex_dist_combined <- sex_dist_census + sex_dist_sample
ggsave(here("figures", "sex_distribution.png"), sex_dist_combined, width = 10, height = 5, dpi = 300, create.dir = TRUE)
